{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lightgbm/__init__.py:48: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n",
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
      "  \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np  # linear algebra\n",
    "import pandas as pd  #\n",
    "from datetime import datetime\n",
    "from scipy.stats import skew  # for some statistics\n",
    "from scipy.special import boxcox1p\n",
    "from scipy.stats import boxcox_normmax\n",
    "from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.model_selection import KFold, cross_val_score\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from mlxtend.regressor import StackingCVRegressor\n",
    "from xgboost import XGBRegressor\n",
    "from lightgbm import LGBMRegressor\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy.stats import norm\n",
    "from scipy import stats\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('./Output/csv/train_engineer.csv')\n",
    "df_test = pd.read_csv('./Output/csv/test_engineer.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "target = df_train['SalePrice']\n",
    "df_train.drop(['SalePrice'], axis = 1, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "overfit = []\n",
    "for i in df_train.columns:\n",
    "    counts = df_train[i].value_counts()\n",
    "    zeros = counts.iloc[0]\n",
    "    if zeros / len(df_train) * 100 > 99.94:\n",
    "        overfit.append(i)\n",
    "df_train = df_train.drop(overfit, axis=1)\n",
    "df_test = df_test.drop(overfit, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# kfold is 10\n",
    "kfolds = KFold(n_splits=10, shuffle=True, random_state=42)\n",
    "# make the function to get the score\n",
    "def rmsle(y, y_pred):\n",
    "    return np.sqrt(mean_squared_error(y, y_pred))\n",
    "def cv_rmse(model, X=df_train):\n",
    "    rmse = np.sqrt(-cross_val_score(model, X, target, scoring=\"neg_mean_squared_error\", cv=kfolds))\n",
    "    return (rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setup models\n",
    "# all the parameter is based on existed data processes\n",
    "alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]\n",
    "alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]\n",
    "e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]\n",
    "e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]\n",
    "\n",
    "ridge = make_pipeline(RobustScaler(),\n",
    "                      RidgeCV(alphas=alphas_alt, cv=kfolds))\n",
    "\n",
    "lasso = make_pipeline(RobustScaler(),\n",
    "                      LassoCV(max_iter=1e7, alphas=alphas2,\n",
    "                              random_state=42, cv=kfolds))\n",
    "\n",
    "elasticnet = make_pipeline(RobustScaler(),\n",
    "                           ElasticNetCV(max_iter=1e7, alphas=e_alphas,\n",
    "                                        cv=kfolds, l1_ratio=e_l1ratio))\n",
    "                                        \n",
    "svr = make_pipeline(RobustScaler(),\n",
    "                      SVR(C= 20, epsilon= 0.008, gamma=0.0003,))\n",
    "\n",
    "\n",
    "gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,\n",
    "                                   max_depth=4, max_features='sqrt',\n",
    "                                   min_samples_leaf=15, min_samples_split=10, \n",
    "                                   loss='huber', random_state =42)\n",
    "                                   \n",
    "\n",
    "lightgbm = LGBMRegressor(objective='regression', \n",
    "                                       num_leaves=4,\n",
    "                                       learning_rate=0.01, \n",
    "                                       n_estimators=5000,\n",
    "                                       max_bin=200, \n",
    "                                       bagging_fraction=0.75,\n",
    "                                       bagging_freq=5, \n",
    "                                       bagging_seed=7,\n",
    "                                       feature_fraction=0.2,\n",
    "                                       feature_fraction_seed=7,\n",
    "                                       verbose=-1,\n",
    "                                       #min_data_in_leaf=2,\n",
    "                                       #min_sum_hessian_in_leaf=11\n",
    "                                       )\n",
    "                                       \n",
    "\n",
    "xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,\n",
    "                                     max_depth=3, min_child_weight=0,\n",
    "                                     gamma=0, subsample=0.7,\n",
    "                                     colsample_bytree=0.7,\n",
    "                                     objective='reg:linear', nthread=-1,\n",
    "                                     scale_pos_weight=1, seed=27,\n",
    "                                     reg_alpha=0.00006)\n",
    "\n",
    "# stack\n",
    "stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,\n",
    "                                            gbr, xgboost, lightgbm),\n",
    "                                meta_regressor=xgboost,\n",
    "                                use_features_in_secondary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12086470974541093"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ridge\n",
    "# linear regreesion without feature importance\n",
    "score = cv_rmse(ridge)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.11846931843258668"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lasso\n",
    "# linear regreesion with no feature importance\n",
    "score = cv_rmse(lasso)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.11864749467553612"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# elastic net\n",
    "# linear model with no feature importance\n",
    "score = cv_rmse(elasticnet)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12068343273696454"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# svr\n",
    "score = cv_rmse(svr)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1366595518206411"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "# random Forest Regression\n",
    "forest_reg = RandomForestRegressor(n_estimators=40, random_state=42, \n",
    "                                   min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)\n",
    "score = cv_rmse(forest_reg)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12087977828691798"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lightgbm\n",
    "score = cv_rmse(lightgbm)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12042945359860377"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# gbr\n",
    "score = cv_rmse(gbr)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:42:53] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:43:37] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:44:21] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:45:05] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:45:48] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:46:32] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:47:15] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:47:57] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:48:39] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21:49:23] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.11833366806900762"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# xgboost\n",
    "score = cv_rmse(xgboost)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot feature importance\n",
    "def plot_feature_importance(model, df):\n",
    "    feature_importance = model.feature_importances_[:30]\n",
    "    # make importances relative to max importance\n",
    "    plt.figure(figsize=(20, 20)) #figure size\n",
    "    #making it a percentage relative to the max value\n",
    "    feature_importance = 100.0 * (feature_importance / feature_importance.max()) \n",
    "    sorted_idx = np.argsort(feature_importance)\n",
    "    pos = np.arange(sorted_idx.shape[0]) + .5\n",
    "    plt.barh(pos, feature_importance[sorted_idx], align='center')\n",
    "    #used train_drop here to show the name of each feature instead of our train_prepared\n",
    "    plt.yticks(pos, df.columns[sorted_idx], fontsize=15) \n",
    "    plt.xlabel('Relative Importance', fontsize=20)\n",
    "    plt.ylabel('Features', fontsize=20)\n",
    "    plt.title('Variable Importance', fontsize=30)\n",
    "    return pd.DataFrame({'columns_name': df.columns, 'feature_importance': model.feature_importances_})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "stack_gen_model = stack_gen.fit(np.array(df_train), np.array(target))\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(stack_gen_model.predict(np.array(df_test))))\n",
    "submission.to_csv('./output/csv/stack_gen.csv', index = False)\n",
    "# 0.12029"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "elastic_model = elasticnet.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(elastic_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/elastic.csv', index = False)\n",
    "# 0.11941"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "lasso_model = lasso.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(lasso_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/lasso.csv', index = False)\n",
    "# 0.11941"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "ridge_model = ridge.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(ridge_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/ridge.csv', index = False)\n",
    "# 0.11926"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "svr_model = svr.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(svr_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/svr.csv', index = False)\n",
    "# 0.12029"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest_reg_model = forest_reg.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(forest_reg.predict(df_test)))\n",
    "submission.to_csv('./output/csv/forest.csv', index = False)\n",
    "# 0.14152"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "lightgbm_model = lightgbm.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(lightgbm.predict(df_test)))\n",
    "submission.to_csv('./output/csv/lightgbm.csv', index = False)\n",
    "# 0.12105"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "gbr_model = gbr.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(gbr_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/gbr.csv', index = False)\n",
    "# 0.12205"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgboost_model = xgboost.fit(df_train, target)\n",
    "submission = pd.read_csv(\"./Resources/sample_submission.csv\")\n",
    "submission.iloc[:,1] = np.floor(np.expm1(xgboost_model.predict(df_test)))\n",
    "submission.to_csv('./output/csv/xgboost.csv', index = False)\n",
    "# 0.12572"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
